the manipulate function (used here with the galton data from the UsingR package) can help to show this
# load necessary packages/install if needed
library(ggplot2); library(UsingR); data(galton)
# function to plot the histograms
myHist <- function(mu) {
  # Plot a histogram of child heights with a vertical line at the candidate
  # centre `mu`, annotating the title with the mean squared error around `mu`.
  meanSqErr <- mean((galton$child - mu)^2)
  ggplot(galton, aes(x = child)) +
    geom_histogram(fill = "salmon", colour = "black", binwidth = 1) +
    # vertical line marking the centre value mu
    geom_vline(xintercept = mu, size = 2) +
    ggtitle(paste0("mu = ", mu, ", MSE = ", round(meanSqErr, 2)))
}
# manipulate lets the user change mu interactively to see how the mean
# squared error changes (uncomment to run inside RStudio):
# library(manipulate); manipulate(myHist(mu), mu = slider(62, 74, step = 0.5))
# plot the histogram centred at the MSE-minimizing value, the sample mean
myHist(mean(galton$child))
In order to visualize the parent-child height relationship, a scatter plot can be used
Note: because there are multiple data points for the same parent/child combination, a third dimension (size of point) should be used when constructing the scatter plot
library(dplyr)
# Build a frequency table for each distinct parent-child height combination;
# point size will encode how many families share the same pair of heights.
freqData <- as.data.frame(table(galton$child, galton$parent))
names(freqData) <- c("child (in)", "parent (in)", "freq")
# table() returns factors; convert the height levels back to numeric values
# (note: `$child`/`$parent` partially match "child (in)"/"parent (in)" and
# the assignments create new plain-named columns used by aes() below)
freqData$child <- as.numeric(as.character(freqData$child))
freqData$parent <- as.numeric(as.character(freqData$parent))
# filter to only combinations that actually occur in the data
g <- ggplot(filter(freqData, freq > 0), aes(x = parent, y = child))
g <- g + scale_size(range = c(2, 20), guide = "none" )
# grey circles slightly larger than the data points give an outline effect;
# bug fix: show.legend is a layer argument, not an aesthetic, so it belongs
# outside aes() (the old `show_guide` spelling is deprecated)
g <- g + geom_point(colour = "grey50", aes(size = freq + 10), show.legend = FALSE)
# plot the actual data points, coloured and sized by frequency
g <- g + geom_point(aes(colour = freq, size = freq))
# change the colour gradient from the default to lightblue --> white
g <- g + scale_colour_gradient(low = "lightblue", high = "white")
g
Let \(Y = \beta X\), and \(\hat \beta\) = estimate of \(\beta\), the slope of the least square regression line \[ \begin{aligned} \sum_{i=1}^n (Y_i - X_i \beta)^2 & = \sum_{i=1}^n \left[ (Y_i - X_i \hat \beta) + (X_i \hat \beta - X_i \beta) \right]^2 \Leftarrow \mbox{added } \pm X_i \hat \beta \mbox{ is effectively adding zero}\\ (expanding~the~terms)& = \sum_{i=1}^n (Y_i - X_i \hat \beta)^2 + 2 \sum_{i=1}^n (Y_i - X_i \hat \beta)(X_i \hat \beta - X_i \beta) + \sum_{i=1}^n (X_i \hat \beta - X_i \beta)^2 \\ \sum_{i=1}^n (Y_i - X_i \beta)^2 & \geq \sum_{i=1}^n (Y_i - X_i \hat \beta)^2 + 2 \sum_{i=1}^n (Y_i - X_i \hat \beta)(X_i \hat \beta - X_i \beta) \Leftarrow \sum_{i=1}^n (X_i \hat \beta - X_i \beta)^2 \mbox{ is always positive}\\ & (ignoring~the~second~term~for~now,~for~\hat \beta ~to~be~the~minimizer~of~the~squares, \\ & the~following~must~be~true)\\ \sum_{i=1}^n (Y_i - X_i \beta)^2 & \geq \sum_{i=1}^n (Y_i - X_i \hat \beta)^2 \Leftarrow \mbox{every other } \beta \mbox{ value creates a least square criteria that is }\geq \hat \beta\\ (this~means)& \Rightarrow 2 \sum_{i=1}^n (Y_i - X_i \hat \beta)(X_i \hat \beta - X_i \beta) = 0\\ (simplifying)& \Rightarrow \sum_{i=1}^n (Y_i - X_i \hat \beta) X_i (\hat \beta - \beta) = 0 \Leftarrow (\hat \beta - \beta) \mbox{ does not depend on }i\\ (simplifying)& \Rightarrow \sum_{i=1}^n (Y_i - X_i \hat \beta) X_i = 0 \\ (solving~for~\hat \beta) & \Rightarrow \hat \beta = \frac{\sum_{i=1}^n Y_i X_i}{\sum_{i=1}^n X_i^2} = \beta\\ \end{aligned} \]
* each of the data points contributes equally to the error between their locations and the regression line –> the goal of regression is to minimize this error
coef(lm(y ~ x)) = R command to run the least square regression model on the data with y as the outcome, and x as the regressor
coef() = returns the slope and intercept coefficients of the lm results
# outcome
# outcome: child height; regressor: parent height
y <- galton$child
x <- galton$parent
# least-squares estimates computed by hand:
# slope = Cor(y, x) * Sd(y) / Sd(x); the line passes through the means
beta1 <- cor(y, x) * sd(y) / sd(x)
beta0 <- mean(y) - beta1 * mean(x)
# the manual estimates agree with lm()
results <- matrix(c(beta0, beta1, coef(lm(y ~ x))), nrow = 2, byrow = TRUE,
dimnames = list(c("manual", "lm(y ~ x)"), c("intercept", "slope")))
results
## intercept slope
## manual 23.94153 0.6462906
## lm(y ~ x) 23.94153 0.6462906
lm(y ~ x - 1) = forces a regression line to go through the origin (0, 0)
# centering y
# center both variables at zero; regression through the origin on the
# centered data then recovers the same slope as the full model
yc <- y - mean(y)
xc <- x - mean(x)
# slope of regression through the origin on the centered data
beta1 <- sum(xc * yc) / sum(xc^2)
# all three approaches agree on the slope
results <- rbind("centered data (manual)" = beta1,
"lm(y ~ x)" = coef(lm(y ~ x))[2],
"lm(yc ~ xc - 1)" = coef(lm(yc ~ xc - 1))[1])
colnames(results) <- "slope"
results
## slope
## centered data (manual) 0.6462906
## lm(y ~ x) 0.6462906
## lm(yc ~ xc - 1) 0.6462906
# normalize (mean 0, sd 1) both variables; scale() centers and then divides
# by the sample standard deviation, equivalent to (v - mean(v)) / sd(v)
yn <- as.vector(scale(y))
xn <- as.vector(scale(x))
# correlation is unchanged by normalization, and for normalized data the
# regression slope equals the correlation
results <- rbind("cor(y, x)" = cor(y, x),
"cor(yn, xn)" = cor(yn, xn),
"slope" = coef(lm(yn ~ xn))[2])
results
## xn
## cor(y, x) 0.4587624
## cor(yn, xn) 0.4587624
## slope 0.4587624
geom_smooth(method = "lm", formula = y~x) function in ggplot2 = adds regression line and confidence interval to graph
formula = y~x = default for the line (argument can be eliminated if y~x produces the line you want)
# constructs table for different combination of parent-child height
# Rebuild the parent-child frequency table so point size/colour can encode
# how many families share each height combination.
freqData <- as.data.frame(table(galton$child, galton$parent))
names(freqData) <- c("child (in)", "parent (in)", "freq")
# convert the factor levels back to numeric heights
freqData$child <- as.numeric(as.character(freqData$child))
freqData$parent <- as.numeric(as.character(freqData$parent))
g <- ggplot(filter(freqData, freq > 0), aes(x = parent, y = child))
g <- g + scale_size(range = c(2, 20), guide = "none" )
# bug fix: show.legend is a layer argument, not an aesthetic, so it belongs
# outside aes() (the old `show_guide` spelling is deprecated)
g <- g + geom_point(colour = "grey50", aes(size = freq + 10), show.legend = FALSE)
g <- g + geom_point(aes(colour = freq, size = freq))
g <- g + scale_colour_gradient(low = "lightblue", high = "white")
# add the least-squares regression line with its confidence band
g <- g + geom_smooth(method = "lm", formula = y ~ x)
g
# load father.son data (father/son height pairs, from UsingR)
data(father.son)
# normalize son's height (mean 0, sd 1)
y <- (father.son$sheight - mean(father.son$sheight)) / sd(father.son$sheight)
# normalize father's height (mean 0, sd 1)
x <- (father.son$fheight - mean(father.son$fheight)) / sd(father.son$fheight)
# calculate the correlation between the normalized heights
rho <- cor(x, y)
# plot the relationship between the two
g = ggplot(data.frame(x = x, y = y), aes(x = x, y = y))
g = g + geom_point(size = 3, alpha = .2, colour = "black")
g = g + geom_point(size = 2, alpha = .2, colour = "red")
g = g + xlim(-4, 4) + ylim(-4, 4)
# reference line for perfect correlation between
# variables (data points would lie on the diagonal line)
g = g + geom_abline(position = "identity")
# if there is no correlation between the two variables,
# the data points will lie on horizontal/vertical lines
g = g + geom_vline(xintercept = 0)
g = g + geom_hline(yintercept = 0)
# slope rho: regression of y on x; slope 1/rho: regression of x on y
g = g + geom_abline(intercept = 0, slope = rho, size = 2)
g = g + geom_abline(intercept = 0, slope = 1 / rho, size = 2)
# add appropriate axis labels
g = g + xlab("Father's height, normalized")
g = g + ylab("Son's height, normalized")
# annotate the three reference lines
g = g + geom_text(x = 3.8, y = 1.6, label="x~y", angle = 25) +
geom_text(x = 3.2, y = 3.6, label="cor(y,x)=1", angle = 35) +
geom_text(x = 1.6, y = 3.8, label="y~x", angle = 60)
g
summary(fitModel) = prints a detailed summary of a linear model fit produced by lm
diamond dataset from UsingR package
lm(price ~ I(carat - mean(carat)), data=diamond) = mean centered linear regression
I() = treats its argument "as is" inside a model formula (arithmetic such as carat - mean(carat) needs I() to work)
predict(fitModel, newdata=data.frame(carat=c(0, 1, 2))) = returns predicted outcome from the given model (linear in our case) at the provided points within the newdata data frame
if newdata is unspecified (argument omitted), then the predict function will return predicted values for all values of the predictor (x variable, carat in this case)
newdata has to be a data frame, and the values you would like to predict at (x variable, carat in this case) have to be specified, or the system won’t know what to do with the provided values
summary(fitModel) = prints detailed summary of linear model
# standard linear regression for price vs carat
fit <- lm(price ~ carat, data = diamond)
# intercept and slope
coef(fit)
## (Intercept) carat
## -259.6259 3721.0249
# mean-centered regression: the intercept becomes the expected price at the
# average carat; the slope is unchanged
fit2 <- lm(price ~ I(carat - mean(carat)), data = diamond)
# intercept and slope
coef(fit2)
## (Intercept) I(carat - mean(carat))
## 500.0833 3721.0249
# regression with more granular scale (1/10th carat): slope is divided by 10
fit3 <- lm(price ~ I(carat * 10), data = diamond)
# intercept and slope
coef(fit3)
## (Intercept) I(carat * 10)
## -259.6259 372.1025
# predictions for 3 carat values
newx <- c(0.16, 0.27, 0.34)
# manual calculations: intercept + slope * x
coef(fit)[1] + coef(fit)[2] * newx
## [1] 335.7381 745.0508 1005.5225
# prediction using the predict function (same values)
predict(fit, newdata = data.frame(carat = newx))
## 1 2 3
## 335.7381 745.0508 1005.5225
# base-graphics scatter plot of the data points
plot(diamond$carat, diamond$price, xlab = "Mass (carats)", ylab = "Price (SIN $)",
bg = "lightblue", col = "black", cex = 1.1, pch = 21,frame = FALSE)
# plot the linear regression line
abline(fit, lwd = 2)
# plot predictions for every observed value of carat (in red)
points(diamond$carat, predict(fit), pch = 19, col = "red")
# add guidelines from the x axis up to the fitted line and across to the
# y axis for the predictions at 0.16, 0.27, and 0.34 carats
lines(c(0.16, 0.16, 0.12), c(200, coef(fit)[1] + coef(fit)[2] * 0.16,
coef(fit)[1] + coef(fit)[2] * 0.16))
lines(c(0.27, 0.27, 0.12), c(200, coef(fit)[1] + coef(fit)[2] * 0.27,
coef(fit)[1] + coef(fit)[2] * 0.27))
lines(c(0.34, 0.34, 0.12), c(200, coef(fit)[1] + coef(fit)[2] * 0.34,
coef(fit)[1] + coef(fit)[2] * 0.34))
# add text labels for the three carat values
text(newx+c(0.03, 0, 0), rep(250, 3), labels = newx, pos = 2)
* the probability density function (pdf) for an outcome \(x\) from the normal distribution is defined as \[f(x~|~\mu, \sigma^2) = (2\pi \sigma^2)^{-1/2}\exp \left(- \frac{1}{2\sigma^2}(y_i - \mu_i)^2\right)\] * the corresponding pdf for \(n\) iid normal random outcomes \(x_1, \ldots, x_n\) is defined as \[f(x_1, \ldots, x_n~|~\mu, \sigma^2) = \prod_{i=1}^n (2\pi \sigma^2)^{-1/2}\exp \left(- \frac{1}{2\sigma^2}(y_i - \mu_i)^2\right)\] which is also known as the likelihood function, denoted in this case as \(\mathcal{L}(\mu, \sigma)\) * to find the maximum likelihood estimator (MLE) of the mean, \(\mu\), we take the derivative of the likelihood \(\mathcal{L}\) with respect to \(\mu\) –> \(\frac{\partial \mathcal{L}}{\partial \mu}\) * since derivatives of products are quite complex to compute, taking the \(\log\) (base \(e\)) makes the calculation much simpler - \(\log\) properties: + \(\log(AB) = \log(A) + \log(B)\) + \(\log(A^B) = B\log(A)\) - because \(\log\) is always increasing and monotonic, or preserves order, finding the maximum MLE = finding th maximum of \(\log\) transformation of MLE * -2 \(\log\) of likelihood function \[
\begin{aligned}
\log(\mathcal{L}(\mu, \sigma)) & = \sum_{i=1}^n -\frac{1}{2}\log(2\pi \sigma^2) - \frac{1}{2\sigma^2}(y_i - \mu_i)^2 \Leftarrow \mbox{multiply -2 on both sides} \\
-2\log(\mathcal{L}(\mu, \sigma)) & = \sum_{i=1}^n \log(2\pi \sigma^2) + \frac{1}{\sigma^2}(y_i - \mu_i)^2 \Leftarrow \sigma^2 \mbox{ does not depend on }i \\
-2\log(\mathcal{L}(\mu, \sigma)) & = n\log(2\pi \sigma^2) + \frac{1}{\sigma^2}\sum_{i=1}^n (y_i - \mu_i)^2\\
\end{aligned}
\] * minimizing -2 \(\log\) likelihood is computationally equivalent as maximizing \(\log\) likelihood \[
\begin{aligned}
\frac{\partial \log(\mathcal{L}(\mu, \sigma))}{\partial \mu} & = \frac{1}{\sigma^2} \frac{\partial \sum_{i=1}^n (y_i - \mu_i)^2}{\partial \mu} = 0 \Leftarrow \mbox{setting this equal to 0 to find minimum}\\
\Rightarrow -\frac{2}{\sigma^2}\sum_{i=1}^n (y_i - \mu_i) & = 0 \Leftarrow \mbox{divide by }-2/\sigma^2 \mbox{ on both sides}\\
\sum_{i=1}^n (y_i - \mu_i) & = 0 \Leftarrow \mbox{move } \mu \mbox{ term over to the right}\\
\sum_{i=1}^n y_i & = \sum_{i=1}^n \mu_i \Leftarrow \mbox{for the two sums to be equal, all the terms must be equal}\\
y_i & = \mu_i \\
\end{aligned}
\] * in the case of our linear regression, \(\mu_i = \beta_0 + \beta_1 X_i\) so \[
\begin{aligned}
\frac{\partial \mathcal{L}(\mu, \sigma))}{\partial \mu} = \frac{\partial \mathcal{L}(\beta_0, \beta_1, \sigma))}{\partial \mu} \\
\mbox{MLE} \Rightarrow Y_i & = \mu_i \\
\mu_i &= \beta_0 + \beta_1 X_i\\
\end{aligned}
\]
mean(fitModel$residuals) = returns the mean of the residuals –> should equal 0
cov(fitModel$residuals, predictors) = returns the covariance (measures correlation) of residuals and predictors –> should also equal 0
deviance(fitModel) = calculates the sum of the squared errors/residuals for the linear model, i.e. the residual variation
summary(fitModel)$sigma = returns the residual standard deviation of a fit model, the unbiased RMSE
summary(fitModel) = creates a list of different parameters of the fit model
# get data
# outcome = price, regressor = carat
y <- diamond$price
x <- diamond$carat
n <- length(y)
# fit the simple linear model
fit <- lm(y ~ x)
# residual standard deviation: summary()'s sigma versus the manual
# unbiased estimate sqrt(SSE / (n - 2))
manualSigma <- sqrt(sum(resid(fit)^2) / (n - 2))
rbind("from summary" = summary(fit)$sigma, "manual" = manualSigma)
## [,1]
## from summary 31.84052
## manual 31.84052
cor(outcome, predictor) = calculates the correlation between predictor and outcome –> squaring this value yields \(R^2\)
example(anscombe) = demonstrates the fallacy of \(R^2\) through the following
relationship between \(R^2\) and \(r\) \[ \begin{aligned} \mbox{Correlation between X and Y} \Rightarrow r & = Cor(Y, X)\\ R^2 & = \frac{\sum_{i=1}^n (\hat Y_i - \bar Y)^2}{\sum_{i=1}^n (Y_i - \bar Y)^2} \\ &recall \Rightarrow (\hat Y_i - \bar Y) = \hat \beta_1 (X_i - \bar X)\\ (substituting (\hat Y_i - \bar Y)) & = \hat \beta_1^2 \frac{\sum_{i=1}^n(X_i - \bar X)^2}{\sum_{i=1}^n (Y_i - \bar Y)^2}\\ &recall \Rightarrow \hat \beta_1 = Cor(Y, X)\frac{Sd(Y)}{Sd(X)}\\ (substituting \hat \beta_1) & = Cor(Y, X)^2\frac{Var(Y)}{Var(X)} \times \frac{\sum_{i=1}^n(X_i - \bar X)^2}{\sum_{i=1}^n (Y_i - \bar Y)^2}\\ & recall Var(Y) = \sum_{i=1}^n (Y_i - \bar Y)^2 and Var(X) = \sum_{i=1}^n (X_i - \bar X)^2\\ (simplifying) \Rightarrow R^2 &= Cor(Y,X)^2\\ \mbox{Or } R^2 & = r^2\\ \end{aligned} \]
total variation derivation \[ \begin{aligned} \mbox{First, we know that } \bar Y & = \hat \beta_0 + \hat \beta_1 \bar X \\ (transforming) \Rightarrow \hat \beta_0 & = \bar Y - \hat \beta_1 \bar X \\ \\ \mbox{We also know that } \hat Y_i & = \hat \beta_0 + \hat \beta_1 X_i \\ \\ \mbox{Next, the residual } = (Y_i - \hat Y_i) & = Y_i - \hat \beta_0 - \hat \beta_1 X_i \\ (substituting~\hat \beta_0) & = Y_i - (\bar Y - \hat \beta_1 \bar X) - \hat \beta_1 X_i \\ (transforming) \Rightarrow (Y_i - \hat Y_i) & = (Y_i - \bar Y) - \hat \beta_1 (X_i - \bar X) \\ \\ \mbox{Next, the regression difference } = (\hat Y_i - \bar Y) & = \hat \beta_0 - \hat \beta_1 X_i -\bar Y \\ (substituting~\hat \beta_0) & = \bar Y - \hat \beta_1 \bar X - \hat \beta_1 X_i - \bar Y \\ (transforming) \Rightarrow (\hat Y_i - \bar Y) & = \hat \beta_1 (X_i - \bar X) \\ \\ \mbox{Total Variation} = \sum_{i=1}^n (Y_i - \bar Y)^2 & = \sum_{i=1}^n (Y_i - \hat Y_i + \hat Y_i - \bar Y)^2 \Leftarrow (adding~\pm \hat Y_i) \\ (expanding) & = \sum_{i=1}^n (Y_i - \hat Y_i)^2 + 2 \sum_{i=1}^n (Y_i - \hat Y_i)(\hat Y_i - \bar Y) + \sum_{i=1}^n (\hat Y_i - \bar Y)^2 \\ \\ \mbox{Looking at } \sum_{i=1}^n (Y_i - \hat Y_i)(\hat Y_i - \bar Y) & \\ (substituting~(Y_i - \hat Y_i),(\hat Y_i - \bar Y)) &= \sum_{i=1}^n \left[(Y_i - \bar Y) - \hat \beta_1 (X_i - \bar X))\right]\left[\hat \beta_1 (X_i - \bar X)\right]\\ (expanding) & = \hat \beta_1 \sum_{i=1}^n (Y_i - \bar Y)(X_i - \bar X) -\hat\beta_1^2\sum_{i=1}^n (X_i - \bar X)^2\\ & (substituting~Y_i, \bar Y) \Rightarrow (Y_i - \bar Y) = (\hat \beta_0 + \hat \beta_1 X_i) - (\hat \beta_0 + \hat \beta_1 \bar X) \\ & (simplifying) \Rightarrow (Y_i - \bar Y) = \hat \beta_1 (X_i - \bar X) \\ (substituting~(Y_i - \bar Y)) & = \hat \beta_1^2 \sum_{i=1}^n (X_i - \bar X)^2-\hat\beta_1^2\sum_{i=1}^n (X_i - \bar X)^2\\ \Rightarrow \sum_{i=1}^n (Y_i - \hat Y_i)(\hat Y_i - \bar Y) &= 0 \\ \\ \mbox{Going back to} \sum_{i=1}^n (Y_i - \bar Y)^2 &= \sum_{i=1}^n (Y_i - \hat Y_i)^2 + 2 
\sum_{i=1}^n (Y_i - \hat Y_i)(\hat Y_i - \bar Y) + \sum_{i=1}^n (\hat Y_i - \bar Y)^2 \\ (since~second~term= 0) \Rightarrow \sum_{i=1}^n (Y_i - \bar Y)^2 &= \sum_{i=1}^n (Y_i - \hat Y_i)^2 + \sum_{i=1}^n (\hat Y_i - \bar Y)^2\\ \end{aligned} \]
resid(fitModel) or fitModel$residuals = extracts model residuals from the fit model (lm in our case) –> list of residual values for every value of X
summary(fitModel)$r.squared = returns the \(R^2\) value of the regression model
# load multiplot function
# load multiplot helper (combines several ggplot objects into one figure)
source("multiplot.R")
# get data: outcome = price, regressor = carat
y <- diamond$price; x <- diamond$carat; n <- length(y)
# linear regression
fit <- lm(y ~ x)
# calculate residuals (observed - fitted)
e <- resid(fit)
# calculate predicted/fitted values
yhat <- predict(fit)
# create 1 x 2 panel plot
par(mfrow=c(1, 2))
# left panel: residuals drawn on the regression line
plot(x, y, xlab = "Mass (carats)", ylab = "Price (SIN $)", bg = "lightblue",
col = "black", cex = 2, pch = 21,frame = FALSE,main = "Residual on Regression Line")
# draw the linear regression line
abline(fit, lwd = 2)
# draw red lines from data points to regression line
for (i in 1 : n){lines(c(x[i], x[i]), c(y[i], yhat[i]), col = "red" , lwd = 2)}
# right panel: residual vs x
plot(x, e, xlab = "Mass (carats)", ylab = "Residuals (SIN $)", bg = "lightblue",
col = "black", cex = 2, pch = 21,frame = FALSE,main = "Residual vs X")
# draw horizontal reference line at zero
abline(h = 0, lwd = 2)
# draw red lines from each residual down to the x axis
for (i in 1 : n){lines(c(x[i], x[i]), c(e[i], 0), col = "red" , lwd = 2)}
# simulate data with a sin-wave pattern (a linear fit will miss the wave)
x <- runif(100, -3, 3); y <- x + sin(x) + rnorm(100, sd = .2);
# plot data + regression
g <- ggplot(data.frame(x = x, y = y), aes(x = x, y = y)) +
geom_smooth(method = "lm", colour = "black") +
geom_point(size = 3, colour = "black", alpha = 0.4) +
geom_point(size = 2, colour = "red", alpha = 0.4)+
ggtitle("Residual on Regression Line")
# plot residuals: the sin pattern becomes clearly visible here
f <- ggplot(data.frame(x = x, y = resid(lm(y ~ x))), aes(x = x, y = y))+
geom_hline(yintercept = 0, size = 2)+
geom_point(size = 3, colour = "black", alpha = 0.4)+
geom_point(size = 2, colour = "red", alpha = 0.4)+
xlab("X") + ylab("Residual")+ ggtitle("Residual vs X")
# combine the two plots side by side (multiplot sourced earlier)
multiplot(g, f, cols = 2)
# create heteroskedastic data: the error sd grows proportionally with x
x <- runif(100, 0, 6); y <- x + rnorm(100, mean = 0, sd = .001 * x)
# plot data + regression
g <- ggplot(data.frame(x = x, y = y), aes(x = x, y = y))+
geom_smooth(method = "lm", colour = "black")+
geom_point(size = 3, colour = "black", alpha = 0.4)+
geom_point(size = 2, colour = "red", alpha = 0.4) +
ggtitle("Residual on Regression Line")
# plot residuals: the fan shape (growing spread) reveals heteroskedasticity
f <- ggplot(data.frame(x = x, y = resid(lm(y ~ x))), aes(x = x, y = y))+
geom_hline(yintercept = 0, size = 2) +
geom_point(size = 3, colour = "black", alpha = 0.4)+
geom_point(size = 2, colour = "red", alpha = 0.4)+
xlab("X") + ylab("Residual") + ggtitle("Residual vs X")
# combine the two plots
multiplot(g, f, cols = 2)
standard errors for coefficients \[\begin{aligned} Var(\hat \beta_1) & = Var\left(\frac{\sum_{i=1}^n (Y_i - \bar Y)(X_i - \bar X)}{\sum_{i=1}^n (X_i - \bar X)^2}\right) \\ (expanding) & = Var\left(\frac{\sum_{i=1}^n Y_i (X_i - \bar X) - \bar Y \sum_{i=1}^n (X_i - \bar X)}{\sum_{i=1}^n (X_i - \bar X)^2}\right) \\ & Since~ \sum_{i=1}^n (X_i - \bar X) = 0 \\ (simplifying) & = \frac{Var\left(\sum_{i=1}^n Y_i (X_i - \bar X)\right)}{\left(\sum_{i=1}^n (X_i - \bar X)^2\right)^2} \Leftarrow \mbox{constant denominator taken out of } Var \mbox{ and squared}\\ (Var(Y_i) = \sigma^2,~independence) & = \frac{\sigma^2 \sum_{i=1}^n (X_i - \bar X)^2}{\left(\sum_{i=1}^n (X_i - \bar X)^2\right)^2} \\ \sigma_{\hat \beta_1}^2 = Var(\hat \beta_1) &= \frac{\sigma^2 }{ \sum_{i=1}^n (X_i - \bar X)^2 }\\ \Rightarrow \sigma_{\hat \beta_1} &= \frac{\sigma}{\sqrt{ \sum_{i=1}^n (X_i - \bar X)^2 }} \\ \\ \mbox{by a similar derivation} \Rightarrow & \\ \sigma_{\hat \beta_0}^2 = Var(\hat \beta_0) & = \left(\frac{1}{n} + \frac{\bar X^2}{\sum_{i=1}^n (X_i - \bar X)^2 }\right)\sigma^2 \\ \Rightarrow \sigma_{\hat \beta_0} &= \sigma \sqrt{\frac{1}{n} + \frac{\bar X^2}{\sum_{i=1}^n (X_i - \bar X)^2 }}\\ \end{aligned}\]
summary(fitModel)$coefficients = returns table summarizing the estimate, standard error, t value and p value for the coefficients \(\beta_0\) and \(\beta_1\)
summary(fitModel)$coefficients produces, for example, the following:
# getting data: outcome = price, regressor = carat
y <- diamond$price; x <- diamond$carat; n <- length(y)
# calculate beta1 (slope)
beta1 <- cor(y, x) * sd(y) / sd(x)
# calculate beta0 (intercept)
beta0 <- mean(y) - beta1 * mean(x)
# Gaussian regression errors/residuals
e <- y - beta0 - beta1 * x
# unbiased estimate of the residual standard deviation (n - 2 df)
sigma <- sqrt(sum(e^2) / (n-2))
# sum of squares of (X_i - X bar)
ssx <- sum((x - mean(x))^2)
# calculate standard errors for intercept and slope
seBeta0 <- (1 / n + mean(x) ^ 2 / ssx) ^ .5 * sigma
seBeta1 <- sigma / sqrt(ssx)
# t statistics for H0: beta0 = 0 and H0: beta1 = 0
tBeta0 <- beta0 / seBeta0; tBeta1 <- beta1 / seBeta1
# p-values for Ha: beta0 != 0 and Ha: beta1 != 0 (two sided)
pBeta0 <- 2 * pt(abs(tBeta0), df = n - 2, lower.tail = FALSE)
pBeta1 <- 2 * pt(abs(tBeta1), df = n - 2, lower.tail = FALSE)
# store results into table
coefTable <- rbind(c(beta0, seBeta0, tBeta0, pBeta0), c(beta1, seBeta1, tBeta1, pBeta1))
colnames(coefTable) <- c("Estimate", "Std. Error", "t value", "P(>|t|)")
rownames(coefTable) <- c("(Intercept)", "x")
# print table
coefTable
## Estimate Std. Error t value P(>|t|)
## (Intercept) -259.6259 17.31886 -14.99094 2.523271e-19
## x 3721.0249 81.78588 45.49715 6.751260e-40
# regression model and the generated table from lm (identical to above)
fit <- lm(y ~ x); summary(fit)$coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -259.6259 17.31886 -14.99094 2.523271e-19
## x 3721.0249 81.78588 45.49715 6.751260e-40
# store the coefficient table in a matrix
sumCoef <- summary(fit)$coefficients
# 95% confidence interval for beta0: estimate +/- t(0.975, df) * std error
sumCoef[1,1] + c(-1, 1) * qt(.975, df = fit$df) * sumCoef[1, 2]
## [1] -294.4870 -224.7649
# 95% confidence interval for beta1, rescaled to price per 1/10 carat
(sumCoef[2,1] + c(-1, 1) * qt(.975, df = fit$df) * sumCoef[2, 2]) / 10
## [1] 355.6398 388.5651
predict(fitModel, data, interval = ("confidence")) = returns a 3-column matrix with data for fit (regression line), lwr (lower bound of interval), and upr (upper bound of interval)
interval = ("confidence") = returns interval for the regression line
interval = ("prediction") = returns interval for the prediction
newdata = must be a new data frame with the values you would like to predict at
* example (ggplot2)
# create a sequence of values that we want to predict at
# create a sequence of 100 x values spanning the observed range to predict at
newx = data.frame(x = seq(min(x), max(x), length = 100))
# calculate fit/lwr/upr columns for both interval types
p1 = data.frame(predict(fit, newdata= newx,interval = ("confidence")))
p2 = data.frame(predict(fit, newdata = newx,interval = ("prediction")))
# add a column labelling which interval each row belongs to
p1$interval = "confidence"; p2$interval = "prediction"
# add a column for the x values we want to predict at
p1$x = newx$x; p2$x = newx$x
# stack the two data frames
dat = rbind(p1, p2)
# change the name of the first column (fit) to y
names(dat)[1] = "y"
# plot the fitted line with a shaded ribbon for each interval type
g <- ggplot(dat, aes(x = x, y = y))
g <- g + geom_ribbon(aes(ymin = lwr, ymax = upr, fill = interval), alpha = 0.2)
g <- g + geom_line()
g + geom_point(data = data.frame(x = x, y=y), aes(x = x, y = y), size = 4)
* example (base)
# plot the x and y values (base graphics version)
plot(x,y,frame=FALSE,xlab="Carat",ylab="Dollars",pch=21,col="black",bg="lightblue",cex=2)
# add the fit line
abline(fit,lwd=2)
# create a sequence of x values that we want to predict at
xVals<-seq(min(x),max(x),by=.01)
# calculate the predicted y values
yVals<-beta0+beta1*xVals
# standard errors for the interval for the line (narrower)
se1<-sigma*sqrt(1/n+(xVals-mean(x))^2/ssx)
# standard errors for the interval for predicted values (wider: extra "1 +")
se2<-sigma*sqrt(1+1/n+(xVals-mean(x))^2/ssx)
# plot the upper and lower bounds of both intervals (approx. 95% via +/- 2 se)
lines(xVals,yVals+2*se1); lines(xVals,yVals-2*se1)
lines(xVals,yVals+2*se2); lines(xVals,yVals-2*se2)
we can hold \(\hat \beta_1\) fixed and solve (2) for \(\hat \beta_2\) \[ \begin{aligned} \sum_{i=1}^n (Y_i - X_{1i}\hat \beta_1) X_{2i} - \sum_{i=1}^n X_{2i}^2 \hat \beta_2& = 0\\ \sum_{i=1}^n (Y_i - X_{1i}\hat \beta_1) X_{2i} &= \sum_{i=1}^n X_{2i}^2 \hat \beta_2 \\ \hat \beta_2 & = \frac{\sum_{i=1}^n (Y_i - X_{1i}\hat \beta_1) X_{2i}}{\sum_{i=1}^n X_{2i}^2} \\ \end{aligned} \]
we can rewrite \[\sum_{i=1}^n \left[\left(Y_i - \frac{\sum_{j=1}^n Y_jX_{2j}}{\sum_{j=1}^n X_{2j}^2}X_{2i}\right) - \hat \beta_1 \left(X_{1i} - \frac{\sum_{j=1}^n X_{1j} X_{2j}}{\sum_{j=1}^n X_{2j}^2}X_{2i}\right)\right] X_{1i} = 0 ~~~~~~(3)\] as \[ \sum_{i=1}^n \left[ e_{i, Y|X_1} -\hat \beta_1 e_{i, X_1|X_2} \right] X_{1i}= 0\] where \[e_{i, a|b} = a_i - \frac{ \sum_{j=1}^n a_j b_j}{ \sum_{j=1}^n b_j^2}b_i\] which is interpreted as the residual when regressing b from a without an intercept
solving (3), we get \[\hat \beta_1 = \frac{\sum_{i=1}^n e_{i, Y | X_2} e_{i, X_1 | X_2}}{\sum_{i=1}^n e_{i, X_1 | X_2}X_{1i}} ~~~~~~(4)\]
to simplify the denominator, we will look at \[ \begin{aligned} \sum_{i=1}^n e_{i, X_1 | X_2}^2 & = \sum_{i=1}^n e_{i, X_1 | X_2}\left(X_{1i} - \frac{\sum_{j=1}^n X_{1j}X_{2j} }{ \sum_{j=1}^n X_{2j}^2 } X_{2i}\right) \\ & = \sum_{i=1}^n e_{i, X_1 | X_2}X_{1i} - \frac{\sum_{j=1}^n X_{1j}X_{2j} }{ \sum_{j=1}^n X_{2j}^2 } \sum_{i=1}^n e_{i, X_1 | X_2} X_{2i}\\ & (recall~that~ \sum_{i=1}^n e_{i}X_i = 0,~so~the~2^{nd}~term~is~0) \\ \Rightarrow \sum_{i=1}^n e_{i, X_1 | X_2}^2 & = \sum_{i=1}^n e_{i, X_1 | X_2}X_{1i}\\ \end{aligned} \]
plugging the above equation back in to (4), we get \[\hat \beta_1 = \frac{\sum_{i=1}^n e_{i, Y | X_2} e_{i, X_1 | X_2}}{\sum_{i=1}^n e_{i, X_1 | X_2}^2}\]
this is interpreted as the effect of variable \(X_1\) when the effects of all other variables have been removed from \(X_1\) and the predicted result \(Y\) (holding everything else constant/adjusting for all other variables)
# simulate the data: three independent standard-normal regressors
n = 100; x = rnorm(n); x2 = rnorm(n); x3 = rnorm(n)
# equation = intercept + var1 + var2 + var3 + error
y = 1 + x + x2 + x3 + rnorm(n, sd = .1)
# residual of y regressed on var2 and var3
ey = resid(lm(y ~ x2 + x3))
# residual of x (var1) regressed on var2 and var3
ex = resid(lm(x ~ x2 + x3))
# estimate beta1 for var1 from the two residual vectors
sum(ey * ex) / sum(ex ^ 2)
## [1] 1.008907
# regression through the origin with var1 with the var2/var3 effect removed
coef(lm(ey ~ ex - 1))
## ex
## 1.008907
# regression on all three variables gives the same coefficient for x
coef(lm(y ~ x + x2 + x3))
## (Intercept) x x2 x3
## 1.0048482 1.0089070 0.9849630 0.9898807
ggpairs(data) [GGally package] = produces a pairwise plot for the predictors, similar to pairs() in the base package
# load dataset
# load dataset and plotting helpers; library() errors on failure, which is
# preferred over require() (which only warns) for required dependencies
library(datasets); data(swiss); library(GGally)
# produce pairwise plot using ggplot2; in current GGally the smoother and its
# options are supplied through wrap() -- the old `params =` argument of
# ggpairs() has been removed
ggpairs(swiss, lower = list(continuous = wrap("smooth", method = "loess")))
# print coefficients of regression of fertility on all predictors
summary(lm(Fertility ~ . , data = swiss))$coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 66.9151817 10.70603759 6.250229 1.906051e-07
## Agriculture -0.1721140 0.07030392 -2.448142 1.872715e-02
## Examination -0.2580082 0.25387820 -1.016268 3.154617e-01
## Education -0.8709401 0.18302860 -4.758492 2.430605e-05
## Catholic 0.1041153 0.03525785 2.952969 5.190079e-03
## Infant.Mortality 1.0770481 0.38171965 2.821568 7.335715e-03
# marginal regression on Agriculture alone: note the sign of the
# Agriculture coefficient flips relative to the full model
summary(lm(Fertility ~ Agriculture, data = swiss))$coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 60.3043752 4.25125562 14.185074 3.216304e-18
## Agriculture 0.1942017 0.07671176 2.531577 1.491720e-02
# simulate data: x1 is essentially x2/100 plus noise, and y = -x1 + x2 + noise
n <- 100; x2 <- 1 : n; x1 <- .01 * x2 + runif(n, -.1, .1); y = -x1 + x2 + rnorm(n, sd = .01)
# print coefficients: the estimated x1 effect reverses sign once x2 is included
c("with x1" = summary(lm(y ~ x1))$coef[2,1],
"with x1 and x2" = summary(lm(y ~ x1 + x2))$coef[2,1])
## with x1 with x1 and x2
## 95.240715 -1.002281
# print p-values: both estimates are "significant" despite the opposite signs
c("with x1" = summary(lm(y ~ x1))$coef[2,4],
"with x1 and x2" = summary(lm(y ~ x1 + x2))$coef[2,4])
## with x1 with x1 and x2
## 7.184852e-72 1.979912e-72
# store all data in one data frame (ey and ex1 are residuals with respect to x2)
dat <- data.frame(y = y, x1 = x1, x2 = x2, ey = resid(lm(y ~ x2)), ex1 = resid(lm(x1 ~ x2)))
# plot y vs x1, coloured by x2
g <- ggplot(dat, aes(y = y, x = x1, colour = x2)) +
geom_point(colour="grey50", size = 2) +
geom_smooth(method = lm, se = FALSE, colour = "black") + geom_point(size = 1.5) +
ggtitle("unadjusted = y vs x1")
# plot residual of y adjusted for x2 vs residual of x1 adjusted for x2
g2 <- ggplot(dat, aes(y = ey, x = ex1, colour = x2)) +
geom_point(colour="grey50", size = 2) +
geom_smooth(method = lm, se = FALSE, colour = "black") + geom_point(size = 1.5) +
ggtitle("adjusted = y, x1 residuals with x2 removed") + labs(x = "resid(x1~x2)",
y = "resid(y~x2)")
# combine the plots (multiplot sourced earlier)
multiplot(g, g2, cols = 2)
the relationship between y and x1 flips sign when adjusting for x2
this effectively means that within each consecutive group/subset of points (each colour gradient) on the left-hand plot (unadjusted), there exists a negative relationship between the points, while the overall trend is going up
Note: it is difficult to interpret and determine which one is the correct model –> one should not claim positive correlation between Agriculture and Fertility simply based on marginal regression lm(Fertility ~ Agriculture, data=swiss)
redundant (linearly dependent) predictors get NA as their coefficients
# add a linear combination of agriculture and education variables
# add a linear combination of the agriculture and education variables
z <- swiss$Agriculture + swiss$Education
# z is linearly dependent on existing predictors, so lm reports NA for it
lm(Fertility ~ . + z, data = swiss)$coef
## (Intercept) Agriculture Examination Education
## 66.9151817 -0.1721140 -0.2580082 -0.8709401
## Catholic Infant.Mortality z
## 0.1041153 1.0770481 NA
lm ignores z by excluding it from the linear regression –> z’s coefficient is NA
InsectSprays data set
# load insect spray data (insect counts for spray types A-F)
data(InsectSprays)
# violin plot of the count distribution for each spray type
ggplot(data = InsectSprays, aes(y = count, x = spray, fill = spray)) +
geom_violin(colour = "black", size = 2) + xlab("Type of spray") +
ylab("Insect count")
Linear model fit with group A as reference category
# linear fit with 5 dummy variables; group A is absorbed into the intercept,
# so each sprayX coefficient is the mean difference of that group from spray A
summary(lm(count ~ spray, data = InsectSprays))$coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.5000000 1.132156 12.8074279 1.470512e-19
## sprayB 0.8333333 1.601110 0.5204724 6.044761e-01
## sprayC -12.4166667 1.601110 -7.7550382 7.266893e-11
## sprayD -9.5833333 1.601110 -5.9854322 9.816910e-08
## sprayE -11.0000000 1.601110 -6.8702352 2.753922e-09
## sprayF 2.1666667 1.601110 1.3532281 1.805998e-01
Hard-coding the dummy variables
the following is equivalent to lm(count ~ spray, data = InsectSprays)
# hard coding dummy variables
# hard-coded dummy variables: I(1 * (spray == 'X')) builds each 0/1 indicator
# explicitly; the results match the factor-based fit above
lm(count ~ I(1 * (spray == 'B')) + I(1 * (spray == 'C')) +
I(1 * (spray == 'D')) + I(1 * (spray == 'E')) +
I(1 * (spray == 'F')), data = InsectSprays)$coefficients
## (Intercept) I(1 * (spray == "B")) I(1 * (spray == "C"))
## 14.5000000 0.8333333 -12.4166667
## I(1 * (spray == "D")) I(1 * (spray == "E")) I(1 * (spray == "F"))
## -9.5833333 -11.0000000 2.1666667
Linear model fit with all 6 categories
# linear fit with all 6 dummy variables: the indicator for A is redundant
# with the intercept, so its coefficient comes back NA
lm(count ~ I(1 * (spray == 'B')) + I(1 * (spray == 'C')) +
I(1 * (spray == 'D')) + I(1 * (spray == 'E')) +
I(1 * (spray == 'F')) + I(1 * (spray == 'A')),
data = InsectSprays)$coefficients
## (Intercept) I(1 * (spray == "B")) I(1 * (spray == "C"))
## 14.5000000 0.8333333 -12.4166667
## I(1 * (spray == "D")) I(1 * (spray == "E")) I(1 * (spray == "F"))
## -9.5833333 -11.0000000 2.1666667
## I(1 * (spray == "A"))
## NA
the coefficient for group A is NA because its indicator is redundant with the intercept
Linear model fit with omitted intercept
# linear model with omitted intercept: each coefficient is now the group mean
summary(lm(count ~ spray - 1, data = InsectSprays))$coefficients
## Estimate Std. Error t value Pr(>|t|)
## sprayA 14.500000 1.132156 12.807428 1.470512e-19
## sprayB 15.333333 1.132156 13.543487 1.001994e-20
## sprayC 2.083333 1.132156 1.840148 7.024334e-02
## sprayD 4.916667 1.132156 4.342749 4.953047e-05
## sprayE 3.500000 1.132156 3.091448 2.916794e-03
## sprayF 16.666667 1.132156 14.721181 1.573471e-22
# the actual means of count by spray type match the coefficients above
round(tapply(InsectSprays$count, InsectSprays$spray, mean), 2)
## A B C D E F
## 14.50 15.33 2.08 4.92 3.50 16.67
to compare between different categories, say B vs C, we can simply subtract the coefficients
relevel(var, "l") = reorders the factor levels within the factor variable var such that the specified level “l” is the reference/base/lowest level
# reorder the levels of spray variable such that C is the lowest level
spray2 <- relevel(InsectSprays$spray, "C")
# rerun linear regression with releveled factor
# the intercept is now the mean for C, and each sprayX coefficient is the
# difference between level X and the new reference level C
summary(lm(count ~ spray2, data = InsectSprays))$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.083333 1.132156 1.840148 7.024334e-02
## spray2A 12.416667 1.601110 7.755038 7.266893e-11
## spray2B 13.250000 1.601110 8.275511 8.509776e-12
## spray2D 2.833333 1.601110 1.769606 8.141205e-02
## spray2E 1.416667 1.601110 0.884803 3.794750e-01
## spray2F 14.583333 1.601110 9.108266 2.794343e-13
Numeric = values for children aged <5 years underweight (%); Sex = records whether the record is for male or female children; Year = year when data was recorded; Income = income for the child's parents
# load in hunger data
hunger <- read.csv("hunger.csv")
# exclude the data with "Both Sexes" as values (only want Male vs Female)
hunger <- hunger[hunger$Sex!="Both sexes", ]
# structure of data
str(hunger)
## 'data.frame': 948 obs. of 12 variables:
## $ Indicator : Factor w/ 1 level "Children aged <5 years underweight (%)": 1 1 1 1 1 1 1 1 1 1 ...
## $ Data.Source : Factor w/ 670 levels "NLIS_310005",..: 7 52 519 380 548 551 396 503 643 632 ...
## $ PUBLISH.STATES: Factor w/ 1 level "Published": 1 1 1 1 1 1 1 1 1 1 ...
## $ Year : int 1986 1990 2005 2002 2008 2008 2003 2006 2012 1999 ...
## $ WHO.region : Factor w/ 6 levels "Africa","Americas",..: 1 2 2 3 1 1 1 2 1 2 ...
## $ Country : Factor w/ 151 levels "Afghanistan",..: 115 104 97 69 57 54 71 13 115 144 ...
## $ Sex : Factor w/ 3 levels "Both sexes","Female",..: 3 3 3 2 2 3 3 3 3 2 ...
## $ Display.Value : num 19.3 2.2 5.3 3.2 17 15.7 19.3 4 15.5 4.2 ...
## $ Numeric : num 19.3 2.2 5.3 3.2 17 15.7 19.3 4 15.5 4.2 ...
## $ Low : logi NA NA NA NA NA NA ...
## $ High : logi NA NA NA NA NA NA ...
## $ Comments : logi NA NA NA NA NA NA ...
# run linear model with Numeric vs Year for male and females
# (two entirely separate regressions, so intercepts AND slopes may differ)
male.fit <- lm(Numeric ~ Year, data = hunger[hunger$Sex == "Male", ])
female.fit <- lm(Numeric ~ Year, data = hunger[hunger$Sex == "Female", ])
# plot % hungry vs the year
# (Sex=="Male")*1+1 maps Female -> 1 (black), Male -> 2 (red)
plot(Numeric ~ Year, data = hunger, pch = 19, col=(Sex=="Male")*1+1)
# plot regression lines for both
abline(male.fit, lwd = 3, col = "black")
abline(female.fit, lwd = 3, col = "red")
model for % hungry (\(H\)) vs year (\(Y\)) and dummy variable for sex (\(X\)) is \[H_{i} = \beta_{0} + \beta_{1}X_i + \beta_{2} Y_{i} + \epsilon_{i}^*\]
abline(intercept, slope) = adds a line to the existing plot based on the intercept and slope provided
abline(lm) = plots the linear regression line on the plot
# run linear model with Numeric vs Year and Sex
# run linear model with Numeric vs Year and Sex (additive model: the two
# sexes share a common Year slope but have different intercepts)
both.fit <- lm(Numeric ~ Year+Sex, data = hunger)
# print fit
both.fit$coef
## (Intercept) Year SexMale
## 633.528289 -0.308397 1.902743
# plot % hungry vs the year
plot(Numeric ~ Year, data = hunger, pch = 19, col=(Sex=="Male")*1+1)
# plot regression lines for both (same slope)
# black = reference level (Female); red = Male, intercept shifted by SexMale
abline(both.fit$coef[1], both.fit$coef[2], lwd = 3, col = "black")
abline(both.fit$coef[1]+both.fit$coef[3], both.fit$coef[2], lwd = 3, col = "red")
lm(outcome ~ var1*var2) = whenever an interaction is specified in lm function using the * operator, the individual terms are added automatically
lm(outcome ~ var1+var2+var1*var2) = builds the exact same model
lm(outcome ~ var1:var2) = builds linear model with only the interaction term (specified by : operator)
# run linear model with Numeric vs Year and Sex and interaction term
# run linear model with Numeric vs Year and Sex and interaction term
# (Year*Sex expands to Year + Sex + Year:Sex, allowing different intercepts
# AND different slopes for the two sexes)
interaction.fit <- lm(Numeric ~ Year*Sex, data = hunger)
# print fit; Year:SexMale is the slope DIFFERENCE for males vs females
interaction.fit$coef
## (Intercept) Year SexMale Year:SexMale
## 603.50579986 -0.29339638 61.94771998 -0.03000132
# plot % hungry vs the year
plot(Numeric ~ Year, data = hunger, pch = 19, col=(Sex=="Male")*1+1)
# plot regression lines for both (different slope)
abline(interaction.fit$coef[1], interaction.fit$coef[2], lwd = 3, col = "black")
abline(interaction.fit$coef[1]+interaction.fit$coef[3],
interaction.fit$coef[2]+interaction.fit$coef[4], lwd = 3, col = "red")
# generate some income data (synthetic: trend plus random noise)
hunger$Income <- 1:nrow(hunger)*10 + 500*runif(nrow(hunger), 0, 10) +
runif(nrow(hunger), 0, 500)^1.5
# run linear model with Numeric vs Year and Income and interaction term
# (with a continuous-by-continuous interaction, the Year slope changes
# smoothly with Income: slope = Year coef + Income * interaction coef)
lm(Numeric ~ Year*Income, data = hunger)$coef
## (Intercept) Year Income Year:Income
## 8.076214e+02 -3.945413e-01 -1.517852e-02 7.552467e-06
lm(y ~ x + t)
# simulate data
# scenario 1: true treatment effect beta2 = 1, and x is distributed the same
# in both groups, so adjusted and unadjusted estimates agree (output below)
n <- 100; t <- rep(c(0, 1), c(n/2, n/2)); x <- c(runif(n/2), runif(n/2));
# define parameters/coefficients
beta0 <- 0; beta1 <- 2; beta2 <- 1; sigma <- .2
# generate outcome using linear model
y <- beta0 + x * beta1 + t * beta2 + rnorm(n, sd = sigma)
# set up axes
plot(x, y, type = "n", frame = FALSE)
# plot linear fit of y vs x
abline(lm(y ~ x), lwd = 2, col = "blue")
# plot means of the two groups (t = 0 vs t = 1)
abline(h = mean(y[1 : (n/2)]), lwd = 3, col = "red")
abline(h = mean(y[(n/2 + 1) : n]), lwd = 3, col = "red")
# plot linear fit of y vs x and t
fit <- lm(y ~ x + t)
# plot the two lines corresponding to (t = 0 vs t = 1)
abline(coef(fit)[1], coef(fit)[2], lwd = 3)
abline(coef(fit)[1] + coef(fit)[3], coef(fit)[2], lwd = 3)
# add in the actual data points (first half t = 0, second half t = 1)
points(x[1 : (n/2)], y[1 : (n/2)], pch = 21, col = "black", bg = "lightblue", cex = 1)
points(x[(n/2 + 1) : n], y[(n/2 + 1) : n], pch = 21, col = "black", bg = "salmon", cex = 1)
# print treatment and adjustment effects
rbind("Treatment Effect" = lm(y~t+x)$coef[2], "Adjustment Effect" = lm(y~t)$coef[2])
## t
## Treatment Effect 1.0205044
## Adjustment Effect 0.9777035
lm(y ~ t), lm(y ~ t + x), lm(y ~ x)
# simulate data
# scenario 2: true treatment effect beta2 = 0, but x is shifted by 1.5 in the
# treated group -- the unadjusted estimate (3.16 below) is pure confounding
# through x, while adjusting for x recovers approximately 0
n <- 100; t <- rep(c(0, 1), c(n/2, n/2)); x <- c(runif(n/2), 1.5 + runif(n/2));
# define parameters/coefficients
beta0 <- 0; beta1 <- 2; beta2 <- 0; sigma <- .2
# generate outcome using linear model
y <- beta0 + x * beta1 + t * beta2 + rnorm(n, sd = sigma)
# set up axes
plot(x, y, type = "n", frame = FALSE)
# plot linear fit of y vs x
abline(lm(y ~ x), lwd = 2, col = "blue")
# plot means of the two groups (t = 0 vs t = 1)
abline(h = mean(y[1 : (n/2)]), lwd = 3, col = "red")
abline(h = mean(y[(n/2 + 1) : n]), lwd = 3, col = "red")
# plot linear fit of y vs x and t
fit <- lm(y ~ x + t)
# plot the two lines corresponding to (t = 0 vs t = 1)
abline(coef(fit)[1], coef(fit)[2], lwd = 3)
abline(coef(fit)[1] + coef(fit)[3], coef(fit)[2], lwd = 3)
# add in the actual data points
points(x[1 : (n/2)], y[1 : (n/2)], pch = 21, col = "black", bg = "lightblue", cex = 1)
points(x[(n/2 + 1) : n], y[(n/2 + 1) : n], pch = 21, col = "black", bg = "salmon", cex = 1)
# print treatment and adjustment effects
rbind("Treatment Effect" = lm(y~t+x)$coef[2], "Adjustment Effect" = lm(y~t)$coef[2])
## t
## Treatment Effect -0.178271
## Adjustment Effect 3.159216
lm(y ~ t), lm(y ~ t + x)
lm(y ~ x): since the coefficient of \(t\), \(\beta_2\), equals 0, the model lm(y ~ x) is appropriate
# simulate data
# scenario 3: true treatment effect beta2 = -1 with partially overlapping x;
# the unadjusted estimate (0.64 below) even has the WRONG SIGN, while
# adjusting for x recovers approximately -1
n <- 100; t <- rep(c(0, 1), c(n/2, n/2)); x <- c(runif(n/2), .9 + runif(n/2));
# define parameters/coefficients
beta0 <- 0; beta1 <- 2; beta2 <- -1; sigma <- .2
# generate outcome using linear model
y <- beta0 + x * beta1 + t * beta2 + rnorm(n, sd = sigma)
# set up axes
plot(x, y, type = "n", frame = FALSE)
# plot linear fit of y vs x
abline(lm(y ~ x), lwd = 2, col = "blue")
# plot means of the two groups (t = 0 vs t = 1)
abline(h = mean(y[1 : (n/2)]), lwd = 3, col = "red")
abline(h = mean(y[(n/2 + 1) : n]), lwd = 3, col = "red")
# plot linear fit of y vs x and t
fit <- lm(y ~ x + t)
# plot the two lines corresponding to (t = 0 vs t = 1)
abline(coef(fit)[1], coef(fit)[2], lwd = 3)
abline(coef(fit)[1] + coef(fit)[3], coef(fit)[2], lwd = 3)
# add in the actual data points
points(x[1 : (n/2)], y[1 : (n/2)], pch = 21, col = "black", bg = "lightblue", cex = 1)
points(x[(n/2 + 1) : n], y[(n/2 + 1) : n], pch = 21, col = "black", bg = "salmon", cex = 1)
# print treatment and adjustment effects
rbind("Treatment Effect" = lm(y~t+x)$coef[2], "Adjustment Effect" = lm(y~t)$coef[2])
## t
## Treatment Effect -0.9977156
## Adjustment Effect 0.6441999
lm(y ~ t), lm(y ~ t + x), lm(y ~ x)
# simulate data
# scenario 4: true treatment effect beta2 = 1, but the CONTROL group has the
# larger x values (shifted by .5), masking the effect marginally -- the
# unadjusted estimate is near 0 while the adjusted one recovers ~1 (output)
n <- 100; t <- rep(c(0, 1), c(n/2, n/2)); x <- c(.5 + runif(n/2), runif(n/2));
# define parameters/coefficients
beta0 <- 0; beta1 <- 2; beta2 <- 1; sigma <- .2
# generate outcome using linear model
y <- beta0 + x * beta1 + t * beta2 + rnorm(n, sd = sigma)
# set up axes
plot(x, y, type = "n", frame = FALSE)
# plot linear fit of y vs x
abline(lm(y ~ x), lwd = 2, col = "blue")
# plot means of the two groups (t = 0 vs t = 1)
abline(h = mean(y[1 : (n/2)]), lwd = 3, col = "red")
abline(h = mean(y[(n/2 + 1) : n]), lwd = 3, col = "red")
# plot linear fit of y vs x and t
fit <- lm(y ~ x + t)
# plot the two lines corresponding to (t = 0 vs t = 1)
abline(coef(fit)[1], coef(fit)[2], lwd = 3)
abline(coef(fit)[1] + coef(fit)[3], coef(fit)[2], lwd = 3)
# add in the actual data points
points(x[1 : (n/2)], y[1 : (n/2)], pch = 21, col = "black", bg = "lightblue", cex = 1)
points(x[(n/2 + 1) : n], y[(n/2 + 1) : n], pch = 21, col = "black", bg = "salmon", cex = 1)
# print treatment and adjustment effects
rbind("Treatment Effect" = lm(y~t+x)$coef[2], "Adjustment Effect" = lm(y~t)$coef[2])
## t
## Treatment Effect 0.963139626
## Adjustment Effect -0.006276381
lm(y ~ t), lm(y ~ t + x), lm(y ~ x)
# simulate data
# scenario 5: no main treatment effect (beta2 = 0) but a strong interaction
# (beta3 = -4), so the two groups have genuinely different SLOPES and no
# single "treatment effect" number summarizes the difference
n <- 100; t <- rep(c(0, 1), c(n/2, n/2)); x <- c(runif(n/2, -1, 1), runif(n/2, -1, 1));
# define parameters/coefficients
beta0 <- 0; beta1 <- 2; beta2 <- 0; beta3 <- -4; sigma <- .2
# generate outcome using linear model (includes t*x interaction term)
y <- beta0 + x * beta1 + t * beta2 + t * x * beta3 + rnorm(n, sd = sigma)
# set up axes
plot(x, y, type = "n", frame = FALSE)
# plot linear fit of y vs x
abline(lm(y ~ x), lwd = 2, col = "blue")
# plot means of the two groups (t = 0 vs t = 1)
abline(h = mean(y[1 : (n/2)]), lwd = 3, col = "red")
abline(h = mean(y[(n/2 + 1) : n]), lwd = 3, col = "red")
# plot linear fit of y vs x and t and interaction term
fit <- lm(y ~ x + t + I(x * t))
# plot the two lines corresponding to (t = 0 vs t = 1)
# t = 1 line differs in BOTH intercept (coef 3) and slope (coef 4)
abline(coef(fit)[1], coef(fit)[2], lwd = 3)
abline(coef(fit)[1] + coef(fit)[3], coef(fit)[2] + coef(fit)[4], lwd = 3)
# add in the actual data points
points(x[1 : (n/2)], y[1 : (n/2)], pch = 21, col = "black", bg = "lightblue", cex = 1)
points(x[(n/2 + 1) : n], y[(n/2 + 1) : n], pch = 21, col = "black", bg = "salmon", cex = 1)
lm(y ~ t), lm(y ~ t + x + t*x), lm(y ~ x)
# simulate data
# with p = 1, x1 is plain runif noise independent of x2; for p < 1 the
# second term would mix in -x2 -- presumably p is meant to be varied to
# study correlated regressors (TODO confirm intent of p)
p <- 1; n <- 100; x2 <- runif(n); x1 <- p * runif(n) - (1 - p) * x2
# define parameters/coefficients
# note the tiny residual noise (sigma = .01) relative to beta2 * x2
beta0 <- 0; beta1 <- 1; beta2 <- 4 ; sigma <- .01
# generate outcome using linear model
y <- beta0 + x1 * beta1 + beta2 * x2 + rnorm(n, sd = sigma)
# plot y vs x1 and x2 (x2 encoded as point colour)
qplot(x1, y) + geom_point(aes(colour=x2)) + geom_smooth(method = lm)
The rgl package can be used to generate 3D plots: rgl::plot3d(x1, x2, y)
# plot the residuals for y and x1 with x2 removed
# (this is the added-variable plot: residuals of y ~ x2 against residuals of
# x1 ~ x2; by the Frisch-Waugh-Lovell theorem its slope equals the x1
# coefficient in the full model y ~ x1 + x2)
plot(resid(lm(x1 ~ x2)), resid(lm(y ~ x2)), frame = FALSE,
col = "black", bg = "lightblue", pch = 21, cex = 1)
# add linear fit line through the residual-vs-residual points
abline(lm(I(resid(lm(y ~ x2))) ~ I(resid(lm(x1 ~ x2)))), lwd = 2)
Given fit <- lm(y~x), we can use plot(fit) to produce a series of 4 diagnostic plots
# load swiss data
data(swiss)
# run linear regression on Fertility vs all other predictors
fit <- lm(Fertility ~ . , data = swiss)
# generate diagnostic plots in 2 x 2 panels
# (plot.lm produces residuals vs fitted, normal Q-Q, scale-location, and
# residuals vs leverage)
par(mfrow = c(2, 2)); plot(fit)
# generate data
n <- 100; x <- rnorm(n); y <- x + rnorm(n, sd = .3)
# set up axes
plot(c(-3, 6), c(-3, 6), type = "n", frame = FALSE, xlab = "X", ylab = "Y")
# plot regression line for y vs x
abline(lm(y ~ x), lwd = 2)
# plot actual (x, y) pairs
points(x, y, cex = 1, bg = "lightblue", col = "black", pch = 21)
# plot 4 points of interest, chosen to illustrate different combinations of
# leverage (distance in x from the data cloud) and discordance with the fit
points(0, 0, cex = 1.5, bg = "darkorange", col = "black", pch = 21)
points(0, 5, cex = 1.5, bg = "darkorange", col = "black", pch = 21)
points(5, 5, cex = 1.5, bg = "darkorange", col = "black", pch = 21)
points(5, 0, cex = 1.5, bg = "darkorange", col = "black", pch = 21)
stats package in R
?influence.measures in R will display the detailed documentation on all available functions to measure influence. The model argument referenced in the following functions is simply the linear fit model generated by the lm function (i.e. model <- lm(y~x)).
rstandard(model) = standardized residuals –> residuals divided by their standard deviations
rstudent(model) = standardized residuals –> residuals divided by their standard deviations, where the \(i^{th}\) data point was deleted in the calculation of the standard deviation for the residual to follow a t distribution
hatvalues(model) = measures of leverage
dffits(model) = change in the predicted response when the \(i^{th}\) point is deleted in fitting the model
dfbetas(model) = change in individual coefficients when the \(i^{th}\) point is deleted in fitting the model
cooks.distance(model) = overall change in coefficients when the \(i^{th}\) point is deleted
resid(model) = returns ordinary residuals
resid(model)/(1-hatvalues(model)) = returns PRESS residuals (i.e. the leave one out cross validation residuals)
# generate random data and point (10, 10)
# the first element of each vector is the deliberately influential point;
# the remaining n points are standard-normal noise with no x-y relationship
x <- c(10, rnorm(n)); y <- c(10, c(rnorm(n)))
# plot y vs x
plot(x, y, frame = FALSE, cex = 1, pch = 21, bg = "lightblue", col = "black")
# perform linear regression
fit <- lm(y ~ x)
# add regression line to plot
abline(fit)
dfbetas(fit) = difference in coefficients for including vs excluding each data point
dfbetas returns an n x m dataframe, where n = number of values in the original dataset, and m = number of coefficients
# calculate the dfbetas for the slope for the first 10 points
# dfbetas for the slope (column 2) for the first 10 points; point 1 -- the
# (10, 10) outlier -- dominates, as the output below shows
round(dfbetas(fit)[1 : 10, 2], 3)
## 1 2 3 4 5 6 7 8 9 10
## 7.114 0.004 -0.001 -0.011 0.001 0.007 0.048 -0.008 0.004 -0.004
as we can see from above, the slope coefficient would change dramatically if the first point (10, 10) is left out
hatvalues(fit) = measures the potential for influence for each point
# calculate the hat values (leverage) for the first 10 points; the outlier
# at (10, 10) again stands out (0.468 vs ~0.01 for the rest)
round(hatvalues(fit)[1 : 10], 3)
## 1 2 3 4 5 6 7 8 9 10
## 0.468 0.011 0.010 0.010 0.018 0.020 0.046 0.019 0.010 0.010
# generate data
x <- rnorm(n); y <- x + rnorm(n, sd = .3)
# add an outlier that fits the relationship (lies on the y = x line)
x <- c(5, x); y <- c(5, y)
# plot the (x, y) pairs
plot(x, y, frame = FALSE, cex = 1, pch = 21, bg = "lightblue", col = "black")
# perform the linear regression
fit2 <- lm(y ~ x)
# add the regression line to the plot
abline(fit2)
# calculate the dfbetas for the slope the first 10 points
# (point 1 no longer dominates: it agrees with the fitted relationship)
round(dfbetas(fit2)[1 : 10, 2], 3)
## 1 2 3 4 5 6 7 8 9 10
## -0.489 -0.266 0.003 0.057 -0.001 -0.068 -0.063 0.023 -0.045 -0.039
# calculate the hat values for the first 10 points
# (point 1 still has high leverage, 0.216, despite its low influence)
round(hatvalues(fit2)[1 : 10], 3)
## 1 2 3 4 5 6 7 8 9 10
## 0.216 0.032 0.010 0.015 0.010 0.011 0.013 0.013 0.011 0.026
dfbetas value (indication of low influence) but still has a substantial hatvalue (indication of high leverage)
# read data (example data set hosted by NCSU; residual plot hides an image)
data <- read.table('http://www4.stat.ncsu.edu/~stefanski/NSF_Supported/Hidden_Images/orly_owl_files/orly_owl_Lin_4p_5_flat.txt',
header = FALSE)
# construct pairwise plot
pairs(data)
# perform regression on V1 with all other predictors (omitting the intercept)
fit <- lm(V1 ~ . - 1, data = data)
# print the coefficient for linear model
# (all predictors look highly significant, yet see the residual plot below)
summary(fit)$coef
## Estimate Std. Error t value Pr(>|t|)
## V2 0.9856157 0.12798121 7.701253 1.989126e-14
## V3 0.9714707 0.12663829 7.671225 2.500259e-14
## V4 0.8606368 0.11958267 7.197003 8.301184e-13
## V5 0.9266981 0.08328434 11.126919 4.778110e-28
# plot the residuals vs fitted values -- systematic patterns here indicate
# model misspecification that the coefficient table alone does not reveal
plot(predict(fit), resid(fit), pch = '.')
“A model is a lense through which to look at your data” – Scott Zeger
“There’s no such thing as a correct model” – Brian Caffo
“There are known knowns. These are things we know that we know. There are known unknowns. That is to say, there are things that we know we don’t know. But there are also unknown unknowns. There are things we don’t know we don’t know.” – Donald Rumsfeld
y~x1+x2 = 1.5 and the standard error for the \(\beta_1\) of y~x1 = 0.5, then the actual increase in standard error of the \(\beta_1\) = 1.5/0.5 - 1 = 200%
# set number of measurements
# demo: R^2 increases mechanically as regressors are added, even though every
# regressor here is pure noise unrelated to y
n <- 100
# set up the axes of the plot
plot(c(1, n), 0 : 1, type = "n", xlab = "p", ylab = expression(R^2),
main = expression(paste(R^2, " vs n")))
# for each value of p from 1 to n
r <- sapply(1 : n, function(p){
# create outcome and p regressors (all independent noise)
y <- rnorm(n); x <- matrix(rnorm(n * p), n, p)
# calculate the R^2
summary(lm(y ~ x))$r.squared
})
# plot the R^2 values and connect them with a line
lines(1 : n, r, lwd = 2)
swiss data set for this example, and compare the following models
# load swiss data
data(swiss)
# run linear regression for Fertility vs Agriculture
fit <- lm(Fertility ~ Agriculture, data = swiss)
# variance for coefficient of Agriculture
# (cov.unscaled is the unscaled covariance matrix; entry [2,2] corresponds
# to the Agriculture coefficient)
a <- summary(fit)$cov.unscaled[2,2]
# run linear regression for Fertility vs Agriculture + Examination
fit2 <- update(fit, Fertility ~ Agriculture + Examination)
# run linear regression for Fertility vs Agriculture + Examination + Education
fit3 <- update(fit, Fertility ~ Agriculture + Examination + Education)
# calculate ratios of variances for Agriculture coef for fit2 and fit3 w.r.t fit1
# (adding correlated regressors inflates the Agriculture variance ~89%/~109%)
c(summary(fit2)$cov.unscaled[2,2], summary(fit3)$cov.unscaled[2,2]) / a - 1
## [1] 0.8915757 1.0891588
[car library] vif(fit) = returns the variance inflation factors for the predictors of the given linear model
# load car library
# load car library (provides vif)
library(car)
# run linear regression on Fertility vs all other predictors
fit <- lm(Fertility ~ . , data = swiss)
# calculate the variance inflation factors
vif(fit)
## Agriculture Examination Education Catholic
## 2.284129 3.675420 2.774943 1.937160
## Infant.Mortality
## 1.107542
# calculate the standard error inflation factors
# (standard error scales with the square root of the variance)
sqrt(vif(fit))
## Agriculture Examination Education Catholic
## 1.511334 1.917138 1.665816 1.391819
## Infant.Mortality
## 1.052398
anova(fit1, fit2, fit3) = performs ANOVA or analysis of variance (or deviance) tables for a series of nested linear regression models
step(lm, k=df) = performs step-wise regression on a given linear model to find and return the best linear model
k=log(n) = specifying the value of k as the log of the number of observations will force the step-wise regression model to use the Bayesian Information Criterion (BIC) instead of the AIC
MASS::stepAIC(lm, k = df) = more versatile, rigorous implementation of the step-wise regression
We will use the swiss data set for this example, and compare the following nested models
# three different regressions that are nested
fit1 <- lm(Fertility ~ Agriculture, data = swiss)
# build the larger models from fit1 (not from the unrelated full-model `fit`
# left over from an earlier section) so this snippet is self-contained;
# update() swaps in the new formula and reuses fit1's data argument (swiss),
# producing exactly the same fitted models as before
fit3 <- update(fit1, Fertility ~ Agriculture + Examination + Education)
fit5 <- update(fit1, Fertility ~ Agriculture + Examination + Education + Catholic + Infant.Mortality)
# perform ANOVA comparing the nested models in sequence
anova(fit1, fit3, fit5)
## Analysis of Variance Table
##
## Model 1: Fertility ~ Agriculture
## Model 2: Fertility ~ Agriculture + Examination + Education
## Model 3: Fertility ~ Agriculture + Examination + Education + Catholic +
## Infant.Mortality
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 45 6283.1
## 2 43 3180.9 2 3102.2 30.211 8.638e-09 ***
## 3 41 2105.0 2 1075.9 10.477 0.0002111 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Res.Df = residual degrees of freedom for the models
RSS = residual sum of squares for the models, a measure of fit
Df = change in degrees of freedom from one model to the next
Sum of Sq = difference/change in residual sum of squares from one model to the next
F = F statistic, measures the ratio of two scaled sums of squares reflecting different sources of variability \[F = \frac{\frac{RSS_1 - RSS_2}{p_2 - p_1}}{\frac{RSS_2}{n-p_2}}\] where \(p_1\) and \(p_2\) = number of parameters in the two models for comparison, and \(n\) = number of observations
Pr(>F) = p-value for the F statistic to indicate whether the change in model is significant or not
We will use the mtcars data set for this example, and perform the step-wise regression/model selection algorithm on the following initial model
# load the mtcars data and the starting regression model
data(mtcars); fit <- lm(mpg ~ cyl + disp + hp + drat + wt, data = mtcars)
# step-wise search; k = log(n) makes step() use the BIC penalty rather than
# the default AIC (k = 2), favouring smaller models
step(fit, k = log(nrow(mtcars)))
## Start: AIC=73.75
## mpg ~ cyl + disp + hp + drat + wt
##
## Df Sum of Sq RSS AIC
## - drat 1 3.018 170.44 70.854
## - disp 1 6.949 174.38 71.584
## - cyl 1 15.411 182.84 73.100
## <none> 167.43 73.748
## - hp 1 21.066 188.49 74.075
## - wt 1 77.476 244.90 82.453
##
## Step: AIC=70.85
## mpg ~ cyl + disp + hp + wt
##
## Df Sum of Sq RSS AIC
## - disp 1 6.176 176.62 68.528
## - hp 1 18.048 188.49 70.609
## <none> 170.44 70.854
## - cyl 1 24.546 194.99 71.694
## - wt 1 90.925 261.37 81.069
##
## Step: AIC=68.53
## mpg ~ cyl + hp + wt
##
## Df Sum of Sq RSS AIC
## - hp 1 14.551 191.17 67.595
## - cyl 1 18.427 195.05 68.237
## <none> 176.62 68.528
## - wt 1 115.354 291.98 81.147
##
## Step: AIC=67.6
## mpg ~ cyl + wt
##
## Df Sum of Sq RSS AIC
## <none> 191.17 67.595
## - cyl 1 87.15 278.32 76.149
## - wt 1 117.16 308.33 79.426
##
## Call:
## lm(formula = mpg ~ cyl + wt, data = mtcars)
##
## Coefficients:
## (Intercept) cyl wt
## 39.686 -1.508 -3.191
mpg ~ cyl + wtglm(), its possible to specify for the model to solve using quasi-likelihood normal equations instead of normal equations through the parameter family = quasi-binomial and family = quasi-poisson respectivelyravenWinNum = 1 for Raven win, 0 for Raven lossravenWin = W for Raven win, L for Raven lossravenScore = score of the Raven team during the matchopponentScore = score of the Raven team during the match# load the data
# load the data (Baltimore Ravens game results)
load("ravensData.rda")
head(ravensData)
## ravenWinNum ravenWin ravenScore opponentScore
## 1 1 W 24 9
## 2 1 W 38 35
## 3 1 W 28 13
## 4 1 W 34 31
## 5 1 W 44 13
## 6 0 L 23 24
# perform linear regression on the binary outcome -- shown here to motivate
# logistic regression; fitted values are not constrained to [0, 1]
summary(lm(ravenWinNum ~ ravenScore, data = ravensData))
##
## Call:
## lm(formula = ravenWinNum ~ ravenScore, data = ravensData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.7302 -0.5076 0.1824 0.3215 0.5719
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.285032 0.256643 1.111 0.2814
## ravenScore 0.015899 0.009059 1.755 0.0963 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4464 on 18 degrees of freedom
## Multiple R-squared: 0.1461, Adjusted R-squared: 0.09868
## F-statistic: 3.08 on 1 and 18 DF, p-value: 0.09625
The manipulate function can vary \(\beta_0\) and \(\beta_1\) to fit logistic regression curves for simulated data
# set x values for the points to be plotted
# set x values for the points to be plotted
x <- seq(-10, 10, length = 1000)
# "library(manipulate)" is needed to use the manipulate function
manipulate(
# plot the logistic curve: P = exp(b0 + b1*x) / (1 + exp(b0 + b1*x))
plot(x, exp(beta0 + beta1 * x) / (1 + exp(beta0 + beta1 * x)),
type = "l", lwd = 3, frame = FALSE),
# slider for beta1 (slope: steepness/direction of the curve)
beta1 = slider(-2, 2, step = .1, initial = 2),
# slider for beta0 (intercept: horizontal shift of the curve)
beta0 = slider(-2, 2, step = .1, initial = 0)
)
* we can use glm(outcome ~ predictor, family = "binomial") to fit a logistic regression to the data
# run logistic regression on data
logRegRavens <- glm(ravenWinNum ~ ravenScore, data = ravensData,family="binomial")
# print summary
summary(logRegRavens)
##
## Call:
## glm(formula = ravenWinNum ~ ravenScore, family = "binomial",
## data = ravensData)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7575 -1.0999 0.5305 0.8060 1.4947
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.68001 1.55412 -1.081 0.28
## ravenScore 0.10658 0.06674 1.597 0.11
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 24.435 on 19 degrees of freedom
## Residual deviance: 20.895 on 18 degrees of freedom
## AIC: 24.895
##
## Number of Fisher Scoring iterations: 5
# exponentiate the coefficients to express them as odds ratios
exp(logRegRavens$coeff)
## (Intercept) ravenScore
## 0.1863724 1.1124694
# exponentiate the log-scale confidence interval to get the interval for
# the odds ratios
exp(confint(logRegRavens))
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 0.005674966 3.106384
## ravenScore 0.996229662 1.303304
# plot the fitted win probabilities against score (the logistic curve)
plot(ravensData$ravenScore,logRegRavens$fitted,pch=19,col="blue",xlab="Score",ylab="Prob Ravens Win")
# perform analysis of deviance (Chi-squared test on the deviance drop)
anova(logRegRavens,test="Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: ravenWinNum
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 19 24.435
## ravenScore 1 3.5398 18 20.895 0.05991 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Df = change in degrees of freedom
ravenScore row = the added ravenScore parameter (slope)
Deviance = measure of goodness of model fit compared to the previous model
Resid. Dev = residual deviance for the current model
Pr(>Chi) = used to evaluate the significance of the added parameter
Deviance value of 3.54 is used to find the corresponding p-value from the Chi Squared distribution, which is 0.06
# set up 1x3 panel plot to illustrate how the Poisson distribution's shape
# changes as the mean (lambda) grows
par(mfrow = c(1, 3))
# Poisson distribution for t = 1, and lambda = 2
plot(0 : 10, dpois(0 : 10, lambda = 2), type = "h", frame = FALSE)
# Poisson distribution for t = 1, and lambda = 10
plot(0 : 20, dpois(0 : 20, lambda = 10), type = "h", frame = FALSE)
# Poisson distribution for t = 1, and lambda = 100
plot(0 : 200, dpois(0 : 200, lambda = 100), type = "h", frame = FALSE)
rga package that can be found at http://skardhamar.github.com/rga/
# load data
# load the Google Analytics web traffic data
load("gaData.rda")
# convert the dates to Julian day numbers (numeric, usable as a regressor)
gaData$julian <- julian(gaData$date)
# plot visits vs dates
plot(gaData$julian,gaData$visits,pch=19,col="darkgrey",xlab="Julian",ylab="Visits")
# plot the visits vs dates
plot(gaData$julian,gaData$visits,pch=19,col="darkgrey",xlab="Julian",ylab="Visits")
# perform linear regression
lm1 <- lm(gaData$visits ~ gaData$julian)
# plot regression line
abline(lm1,col="red",lwd=3)
# fit on the log scale instead; +1 guards against log(0) for zero-visit days,
# and exponentiating the coefficients gives multiplicative effects
round(exp(coef(lm(I(log(gaData$visits + 1)) ~ gaData$julian))), 5)
## (Intercept) gaData$julian
## 0.00000 1.00231
# plot visits vs dates
plot(gaData$julian,gaData$visits,pch=19,col="darkgrey",xlab="Julian",ylab="Visits")
# construct Poisson regression model
glm1 <- glm(gaData$visits ~ gaData$julian,family="poisson")
# plot linear regression line in red
abline(lm1,col="red",lwd=3)
# plot Poisson regression fitted curve in blue
lines(gaData$julian,glm1$fitted,col="blue",lwd=3)
# plot residuals vs fitted values
plot(glm1$fitted,glm1$residuals,pch=19,col="grey",ylab="Residuals",xlab="Fitted")
glm(outcome~predictor, family = "quasi-poisson") = introduces an additional multiplicative factor \(\phi\) to the denominator of the model so that the variance is \(\phi \mu\) rather than just \(\mu\) (see Variances and Quasi-Likelihoods)
The sandwich package is one way to calculate the robust standard errors
# load sandwich package
library(sandwich)
# compute model-agnostic (heteroskedasticity-robust) Wald confidence
# intervals for the coefficients of a fitted model, using sandwich::vcovHC
# standard errors in place of the model-based ones that stats::confint uses
#
# object = fitted model (e.g. from lm/glm); parm = coefficients to include
# (names or indices; defaults to all); level = confidence level
# returns a length(parm) x 2 matrix of lower/upper bounds
confint.agnostic <- function (object, parm, level = 0.95, ...)
{
cf <- coef(object); pnames <- names(cf)
# default to all coefficients; allow numeric or character selection
if (missing(parm))
parm <- pnames
else if (is.numeric(parm))
parm <- pnames[parm]
# two-sided tail probabilities, e.g. 0.025 / 0.975 for level = 0.95
a <- (1 - level)/2; a <- c(a, 1 - a)
# format column labels like "2.5 %"; done inline rather than through the
# unexported internal stats:::format.perc, which is fragile across R versions
pct <- paste(format(100 * a, trim = TRUE, scientific = FALSE, digits = 3), "%")
# standard-normal quantiles for the Wald interval
fac <- qnorm(a)
ci <- array(NA, dim = c(length(parm), 2L), dimnames = list(parm,
pct))
# robust (sandwich) standard errors for the selected coefficients
ses <- sqrt(diag(sandwich::vcovHC(object)))[parm]
ci[] <- cf[parm] + ses %o% fac
ci
}
# regular confidence interval from Poisson Model
confint(glm1)
## 2.5 % 97.5 %
## (Intercept) -34.346577587 -31.159715656
## gaData$julian 0.002190043 0.002396461
# model agnostic standard errors
# (robust intervals are wider here -- compare the bounds with those above --
# reflecting overdispersion the plain Poisson model ignores)
confint.agnostic(glm1)
## 2.5 % 97.5 %
## (Intercept) -36.362674594 -29.136997254
## gaData$julian 0.002058147 0.002527955
glm(outcome ~ predictor, offset = log(offset), family = "poisson") = perform Poisson regression with offset
glm(outcome ~ predictor + log(offset)) = produces the same result
# perform Poisson regression with offset for number of visits
# Poisson regression of Simply Statistics hits with an offset for total site
# visits; the offset turns the model into one for the RATE
# simplystats / (visits + 1) rather than the raw count (+1 avoids log(0))
glm2 <- glm(gaData$simplystats ~ julian(gaData$date),offset=log(visits+1),
family="poisson",data=gaData)
# plot the fitted means (from simply statistics)
plot(julian(gaData$date),glm2$fitted,col="blue",pch=19,xlab="Date",ylab="Fitted Counts")
# plot the fitted means (total visit)
points(julian(gaData$date),glm1$fitted,col="red",pch=19)
# plot the rates for simply stats
plot(julian(gaData$date),gaData$simplystats/(gaData$visits+1),col="grey",xlab="Date",
ylab="Fitted Rates",pch=19)
# plot the fitted rates for simply stats (visit/day)
lines(julian(gaData$date),glm2$fitted/(gaData$visits+1),col="blue",lwd=3)
log(visits) to address 0 values
pscl package - zeroinfl fits zero-inflated Poisson (ZIP) models to such data
# simulate noisy sine-wave data
n <- 500; x <- seq(0, 4 * pi, length = n); y <- sin(x) + rnorm(n, sd = .3)
# define 20 knot points
knots <- seq(0, 8 * pi, length = 20);
# define the ()+ function to only take the values that are positive after the knot pt
# each column is a hinge term (x - knot)+, zero left of its knot
splineTerms <- sapply(knots, function(knot) (x > knot) * (x - knot))
# define the predictors as X and spline term
xMat <- cbind(x, splineTerms)
# fit linear models for y vs predictors (piecewise-linear fit: the slope can
# change at every knot, but the fit has visible corners there)
yhat <- predict(lm(y ~ xMat))
# plot data points (x, y)
plot(x, y, frame = FALSE, pch = 21, bg = "lightblue")
# plot fitted values
lines(x, yhat, col = "red", lwd = 2)
# define the knot terms in the model, now SQUARED hinge terms so the fit and
# its first derivative are continuous at the knots (smooth curve)
splineTerms <- sapply(knots, function(knot) (x > knot) * (x - knot)^2)
# define the predictors as x, x^2 and knot terms
xMat <- cbind(x, x^2, splineTerms)
# fit linear models for y vs predictors
yhat <- predict(lm(y ~ xMat))
# plot data points (x, y)
plot(x, y, frame = FALSE, pch = 21, bg = "lightblue")
# plot fitted values
lines(x, yhat, col = "red", lwd = 2)
# frequencies for white keys from c4 to c5
notes4 <- c(261.63, 293.66, 329.63, 349.23, 392.00, 440.00, 493.88, 523.25)
# generate sequence for 2 seconds (sampled every millisecond)
t <- seq(0, 2, by = .001); n <- length(t)
# define data for c4 e4 g4 using sine waves with their frequencies
c4 <- sin(2 * pi * notes4[1] * t); e4 <- sin(2 * pi * notes4[3] * t);
g4 <- sin(2 * pi * notes4[5] * t)
# define data for a chord and add a bit of noise
chord <- c4 + e4 + g4 + rnorm(n, 0, 0.3)
# generate profile data for all notes (one sine-wave column per key)
x <- sapply(notes4, function(freq) sin(2 * pi * freq * t))
# regress the chord on all note profiles (no intercept): large squared
# coefficients identify which notes are present in the chord
fit <- lm(chord ~ x - 1)
# set up plot
plot(c(0, 9), c(0, 1.5), xlab = "Note", ylab = "Coef^2", axes = FALSE, frame = TRUE, type = "n")
# set up axes
axis(2)
axis(1, at = 1 : 8, labels = c("c4", "d4", "e4", "f4", "g4", "a4", "b4", "c5"))
# add vertical lines for each note
for (i in 1 : 8) abline(v = i, lwd = 3, col = grey(.8))
# plot the squared regression coefficients (padded with zeros at the ends)
lines(c(0, 1 : 8, 9), c(0, coef(fit)^2, 0), type = "l", lwd = 3, col = "red")
fft(data) = performs fast Fourier transforms on provided data
Re(data) = subset to only the real components of the complex data
# perform fast fourier transform on the chord signal
# perform fast fourier transform on the chord signal; peaks in the power
# spectrum below correspond to the frequencies present in the chord
a <- fft(chord)
# plot only the real components of the fft
plot(Re(a)^2, type = "l")